from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
import pandas as pd

from preprocess import conv_dict, conv_data


# --- Data loading and preprocessing -----------------------------------------
# Read the raw train/test CSVs, drop the ID column, carve out the rows whose
# Credit_Product is missing as an extra evaluation set, and encode remaining
# NaNs with the sentinel string 'Missing' so CatBoost treats missingness as a
# regular categorical level.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train = train.drop(['ID'], axis=1)
test = test.drop(['ID'], axis=1)

# Rows with a missing Credit_Product form a secondary evaluation set.
# Index with the boolean mask directly and take an explicit .copy():
# fillna(inplace=True) on a chained slice raises SettingWithCopyWarning and
# may silently operate on a temporary.
null_index = train["Credit_Product"].isnull()
test_null = train.loc[null_index].copy()
test_null.fillna(value='Missing', inplace=True)

# NOTE(review): these rows are NOT removed from `train`, so the model is also
# trained on them — the "NULL-AUC" reported later is measured on data the
# model has seen. Confirm whether that is intentional.
y_null = np.array(test_null['Is_Lead'])
# axis must be passed by keyword: positional `axis` in DataFrame.drop was
# deprecated in pandas 1.x and removed in pandas 2.0 (TypeError).
x_null = np.array(test_null.drop('Is_Lead', axis=1))

train.fillna(value='Missing', inplace=True)
test.fillna(value='Missing', inplace=True)
print(train.info())
print(test.info())
y = np.array(train['Is_Lead'])
x = np.array(train.drop('Is_Lead', axis=1))

# --- Train/validation split and CatBoost Pool construction ------------------
# Hold out 20% of the rows for early stopping and metric reporting; the fixed
# seed keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=18)

# Column positions (after dropping ID / Is_Lead) holding categorical features;
# CatBoost handles their encoding internally.
categories = [0, 2, 3, 4, 6, 8]

# Build the training and evaluation pools from the two (features, labels)
# pairs in one pass.
train_dataset, eval_dataset = (
    Pool(data=features, label=labels, cat_features=categories)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

# --- Model configuration ----------------------------------------------------
# Per-CTR settings for plain categorical features: border type and count are
# pinned explicitly for each CTR kind.
ctr_simple = [
    'Borders:CtrBorderType=UniformAndQuantiles:CtrBorderCount=5',
    'Buckets:CtrBorderType=UniformAndQuantiles:CtrBorderCount=5',
    'FeatureFreq:CtrBorderType=MinEntropy:CtrBorderCount=5',
    'FloatTargetMeanValue:CtrBorderType=MinEntropy:CtrBorderCount=5',
]
# CTR settings for feature combinations.
ctr_combinations = [
    'FloatTargetMeanValue:CtrBorderCount=5:CtrBorderType=Uniform',
    'Buckets:CtrBorderCount=5:CtrBorderType=Uniform',
    'FeatureFreq:CtrBorderCount=5:CtrBorderType=Median',
    'FloatTargetMeanValue:CtrBorderCount=5:CtrBorderType=Uniform',
]

# GPU-trained ordered-boosting classifier; iteration-based overfitting
# detector (od_type='Iter') with a 500-round patience, AUC as the eval metric.
model_params = dict(
    iterations=5000,
    early_stopping_rounds=500,
    learning_rate=0.01,
    loss_function='Logloss',
    logging_level='Verbose',
    random_seed=18,
    metric_period=500,
    max_depth=10,
    simple_ctr=ctr_simple,
    combinations_ctr=ctr_combinations,
    boosting_type='Ordered',
    od_type='Iter',
    eval_metric='AUC:type=Classic',
    task_type='GPU',
    devices='0:1',
)
model = CatBoostClassifier(**model_params)

# --- Training and validation metrics ----------------------------------------
# Fit with early stopping against the held-out pool, then report standard
# binary-classification metrics on the validation split.
model.fit(train_dataset, eval_set=eval_dataset, logging_level='Verbose')
preds_class = model.predict(eval_dataset)
preds_probs = model.predict_proba(eval_dataset)[:, 1]  # P(class == 1)

print(model.feature_importances_)
print('AUC: %.4f' % metrics.roc_auc_score(y_test, preds_probs))
print('ACC: %.4f' % metrics.accuracy_score(y_test, preds_class))
print('Recall: %.4f' % metrics.recall_score(y_test, preds_class))
print('F1-score: %.4f' % metrics.f1_score(y_test, preds_class))
# Label typo fixed: 'Precesion' -> 'Precision'.
print('Precision: %.4f' % metrics.precision_score(y_test, preds_class))

# --- Inference on the submission set and the missing-Credit_Product rows ----
test_arr = np.array(test)
preds_test = model.predict_proba(test_arr)[:, 1]

# Score the rows whose Credit_Product was originally missing and report AUC
# against their known labels.
preds_null = model.predict_proba(x_null)[:, 1]
null_auc = metrics.roc_auc_score(y_null, preds_null)
print('NULL-AUC: %.4f' % null_auc)

# One predicted probability per line, 8 decimal places.
np.savetxt('./catboost_ans.csv', preds_test, fmt='%.8f', delimiter=',')
